{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4dc831a0",
   "metadata": {},
   "source": [
    "### 单机多卡训练"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de89501a",
   "metadata": {},
   "source": [
    "<font color=\"red\">我在windows运行命令，一直报错：ValueError: You are going to using gloo on windows, but currently is not supported</font>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50d67a9c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ce47b01f",
   "metadata": {},
   "source": [
    "#### 高层API场景"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a0947304",
   "metadata": {},
   "source": [
    ">[paddle.distributed.launch](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/distributed/launch_cn.html#cn-api-distributed-launch)\n",
    ">\n",
    ">当调用 paddle.Model 高层API实现训练时，想要启动单机多卡训练非常简单，代码不需要做任何修改，只需要在启动的时候加上参数 -m paddle.distributed.launch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "206f0528",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 单机单卡启动，默认使用第0号卡\n",
    "python train.py\n",
    "# 单机多卡启动，默认使用当前可见的所有的卡\n",
    "python -m paddle.distributed.launch train.py\n",
    "# 单机多卡，指定使用第0号卡和第1号卡\n",
    "python -m paddle.distributed.launch -gpus '0,1' train.py\n",
    "\n",
    "# 单机多卡，指定使用第0号卡和第1号卡\n",
    "export CUDA_VISIBLE_DEVICES=0,1\n",
    "python -m paddle.distributed.launch train.py\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ca27fa0b",
   "metadata": {},
   "source": [
    "#### 基础API场景"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1fad766a",
   "metadata": {},
   "source": [
    "使用基础API实现训练，需要修改3处：\n",
    ">第一处，导入分布式训练所需要的包 import paddle.distributed as dist\n",
    ">\n",
    ">第二处，初始化并行环境 dist.init_parallel_env()\n",
    ">\n",
    ">第三处，用 paddle.DataParallel(mnist) 封装模型\n",
    ">"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4189eafd",
   "metadata": {},
   "outputs": [],
   "source": [
    "import paddle\n",
    "# 第一步，导入分布式计算包\n",
    "import paddle.distributed as dist\n",
    "from paddle.vision.transforms import ToTensor\n",
    "\n",
    "# 第二步，初始化运行环境\n",
    "dist.init_parallel_env()\n",
    "# 加载数据集\n",
    "train_dataset = paddle.vision.MNIST(mode=\"train\",transform=ToTensor())\n",
    "test_dataset = paddle.vision.MNIST(mode=\"test\",transform=ToTensor())\n",
    "# 模型组网\n",
    "mnist = paddle.nn.Sequential(\n",
    "    paddle.nn.Flatten(1,-1),\n",
    "    paddle.nn.Linear(784,512),\n",
    "    paddle.nn.ReLU(),\n",
    "    paddle.nn.Dropout(0.2),\n",
    "    paddle.nn.Linear(512,10)\n",
    ")\n",
    "# 用 DataLoader 加载数据\n",
    "train_loader = paddle.io.DataLoader(train_dataset,batch_size=32,shuffle=True)\n",
    "\n",
    "# 第三步，用paddle.DataParallel 封装模型\n",
    "mnist = paddle.DataParallel(mnist)\n",
    "\n",
    "mnist.train()\n",
    "\n",
    "\n",
    "epochs = 5\n",
    "# 优化器\n",
    "optim = paddle.optimizer.Adam(parameters=mnist.parameters())\n",
    "\n",
    "for epoch in range(epochs):\n",
    "    for batch_id,data in enumerate(train_loader()):\n",
    "        x_data = data[0]\n",
    "        y_data = data[1]\n",
    "        predicts = mnist(x_data)\n",
    "\n",
    "        # 损失函数\n",
    "        loss = paddle.nn.functional.cross_entropy(predicts,y_data)\n",
    "        # 准确率\n",
    "        acc = paddle.metric.accuracy(predicts,y_data)\n",
    "        # 反向传播更新参数，并清除之前的梯度信息\n",
    "        loss.backward()\n",
    "        optim.step()\n",
    "        optim.clear_grad()\n",
    "\n",
    "        # 打印信息\n",
    "        if (batch_id+1 % 900) == 0:\n",
    "            print(f\"epoch：{epoch}，batch_id：{batch_id}，loss：{loss.numpy()}，acc：{acc.numpy()}\")\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ad9722b1",
   "metadata": {},
   "source": [
    "#### spawn启动"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "79346bc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import paddle\n",
    "import paddle.nn as nn\n",
    "import paddle.optimizer as opt\n",
    "import paddle.distributed as dist"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e0fc2e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from __future__ import print_function\n",
    "\n",
    "import paddle\n",
    "import paddle.nn as nn\n",
    "import paddle.optimizer as opt\n",
    "import paddle.distributed as dist\n",
    "\n",
    "class LinearNet(nn.Layer):\n",
    "    def __init__(self):\n",
    "        super(LinearNet, self).__init__()\n",
    "        self._linear1 = nn.Linear(10, 10)\n",
    "        self._linear2 = nn.Linear(10, 1)\n",
    "\n",
    "    def forward(self, x):\n",
    "        return self._linear2(self._linear1(x))\n",
    "\n",
    "def train(print_result=False):\n",
    "\n",
    "    # 1. 初始化并行训练环境\n",
    "    dist.init_parallel_env()\n",
    "\n",
    "    # 2. 创建并行训练 Layer 和 Optimizer\n",
    "    layer = LinearNet()\n",
    "    dp_layer = paddle.DataParallel(layer)\n",
    "\n",
    "    loss_fn = nn.MSELoss()\n",
    "    adam = opt.Adam(\n",
    "        learning_rate=0.001, parameters=dp_layer.parameters())\n",
    "\n",
    "    # 3. 运行网络\n",
    "    inputs = paddle.randn([10, 10], 'float32')\n",
    "    outputs = dp_layer(inputs)\n",
    "    labels = paddle.randn([10, 1], 'float32')\n",
    "    loss = loss_fn(outputs, labels)\n",
    "\n",
    "    if print_result is True:\n",
    "        print(\"loss:\", loss.numpy())\n",
    "\n",
    "    loss.backward()\n",
    "\n",
    "    adam.step()\n",
    "    adam.clear_grad()\n",
    "\n",
    "# 使用方式1：仅传入训练函数\n",
    "# 适用场景：训练函数不需要任何参数，并且需要使用所有当前可见的GPU设备并行训练\n",
    "if __name__ == '__main__':\n",
    "    dist.spawn(train)\n",
    "\n",
    "# 使用方式2：传入训练函数和参数\n",
    "# 适用场景：训练函数需要一些参数，并且需要使用所有当前可见的GPU设备并行训练\n",
    "if __name__ == '__main__':\n",
    "    dist.spawn(train, args=(True,))\n",
    "\n",
    "# 使用方式3：传入训练函数、参数并指定并行进程数\n",
    "# 适用场景：训练函数需要一些参数，并且仅需要使用部分可见的GPU设备并行训练，例如：\n",
    "# 当前机器有8张GPU卡 {0,1,2,3,4,5,6,7}，此时会使用前两张卡 {0,1}；\n",
    "# 或者当前机器通过配置环境变量 CUDA_VISIBLE_DEVICES=4,5,6,7，仅使4张\n",
    "# GPU卡可见，此时会使用可见的前两张卡 {4,5}\n",
    "if __name__ == '__main__':\n",
    "    dist.spawn(train, args=(True,), nprocs=2)\n",
    "\n",
    "# 使用方式4：传入训练函数、参数、指定进程数并指定当前使用的卡号\n",
    "# 使用场景：训练函数需要一些参数，并且仅需要使用部分可见的GPU设备并行训练，但是\n",
    "# 可能由于权限问题，无权配置当前机器的环境变量，例如：当前机器有8张GPU卡\n",
    "# {0,1,2,3,4,5,6,7}，但你无权配置CUDA_VISIBLE_DEVICES，此时可以通过\n",
    "# 指定参数 gpus 选择希望使用的卡，例如 gpus='4,5'，\n",
    "# 可以指定使用第4号卡和第5号卡\n",
    "if __name__ == '__main__':\n",
    "    dist.spawn(train, nprocs=2, gpus='4,5')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "571d79c2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "68a79d80",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f29b82d2",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da9f2bf7",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97ba6141",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a73e0dee",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef6a069d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
